In [1]:
import os

import numpy as np
import pandas as pd
import scipy as sp
In [3]:
# Load the lake dataset.
# The CSV location can be overridden through the STATE_LAKES_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# not set, the original local path is used unchanged.
DEFAULT_STATE_LAKES_CSV = r'C:\Users\nisha\Desktop\data\state_lakes.csv'
state_df = pd.read_csv(os.environ.get('STATE_LAKES_CSV', DEFAULT_STATE_LAKES_CSV))
In [5]:
# Preview the first five rows of the dataset (rendered as the cell output).
state_df.head()
Out[5]:
| | lfenzid | lake_name | indicator | indicator_name | measurement | value | units | region |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Lake Rototoa | CHLA | Chlorophyll-a | Median | 2.250 | mg/m3 | Auckland |
| 1 | 0 | Lake Rototoa | ECOLI | E. coli | Median | 5.000 | cfu/100mL | Auckland |
| 2 | 0 | Lake Rototoa | NH4N | Ammoniacal nitrogen | Median | 0.005 | mg/L | Auckland |
| 3 | 0 | Lake Rototoa | NH4N_adj | Ammoniacal nitrogen | Median | 0.005 | mg/L | Auckland |
| 4 | 0 | Lake Rototoa | CLAR | Clarity | Median | 3.875 | m | Auckland |
In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
state_df.describe()
Out[7]:
| | lfenzid | value |
|---|---|---|
| count | 592.000000 | 592.000000 |
| mean | 31293.190878 | 3.987730 |
| std | 16225.990820 | 13.924649 |
| min | 0.000000 | 0.000220 |
| 25% | 17761.000000 | 0.013875 |
| 50% | 26562.000000 | 0.475250 |
| 75% | 48177.000000 | 3.885225 |
| max | 54742.000000 | 238.000000 |
In [9]:
# Column dtypes and non-null counts; use this to spot columns with missing values.
state_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   lfenzid         592 non-null    int64
 1   lake_name       592 non-null    object
 2   indicator       592 non-null    object
 3   indicator_name  592 non-null    object
 4   measurement     592 non-null    object
 5   value           592 non-null    float64
 6   units           509 non-null    object
 7   region          556 non-null    object
dtypes: float64(1), int64(1), object(6)
memory usage: 37.1+ KB
In [11]:
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
In [13]:
# Horizontal bar chart: number of rows per region, split by indicator name.
fig, ax = plt.subplots(figsize=(14, 8))
sns.countplot(data=state_df, y='region', hue='indicator_name', ax=ax)
ax.set_title('Indicator Name per Region')
ax.set_xlabel('Count')
ax.set_ylabel('Region')
fig.tight_layout()
plt.show()
In [17]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def add_real_coordinates(df):
    """
    Attach ``latitude`` and ``longitude`` columns to ``df`` by lake name.

    Lakes listed in the built-in lookup table receive the coordinates stored
    there. Any other lake is reported with a printed warning and given
    coordinates from ``simulate_coordinates_for_missing``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``lake_name`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Left merge of ``df`` with the coordinate table.
    """
    # Lookup table: lake name -> (latitude, longitude)
    known_positions = {
        'Wainono Lagoon': (-44.7015, 171.1419),
        'Lake Forsyth (Lake Wairewa)': (-43.8088, 172.7176),
        'Keland Pond': (-43.4545, 172.1892),
        'Lake Ellesmere (Te Waihora)': (-43.7998, 172.4541),
        'Coopers Lagoon': (-43.9522, 172.2157),
        'Lake Rotorua (South)': (-43.9800, 171.0600)
    }

    # One record per known lake, ready to be joined on lake_name
    coord_rows = [
        {'lake_name': name, 'latitude': position[0], 'longitude': position[1]}
        for name, position in known_positions.items()
    ]
    lake_coords_df = pd.DataFrame(coord_rows)

    # A left join keeps every input row; lakes absent from the table get NaN
    merged_df = df.merge(lake_coords_df, on='lake_name', how='left')

    lacks_latitude = merged_df['latitude'].isna()
    missing_coords = merged_df[lacks_latitude]['lake_name'].unique()
    if len(missing_coords) == 0:
        return merged_df

    print(f"Warning: Missing coordinates for lakes: {missing_coords}")

    # Fall back to simulated positions for the lakes the table does not cover
    missing_lakes_df = df[df['lake_name'].isin(missing_coords)]
    sim_coords = simulate_coordinates_for_missing(missing_lakes_df)

    for lake, (lat, lon) in sim_coords.items():
        rows_for_lake = merged_df['lake_name'] == lake
        merged_df.loc[rows_for_lake, 'latitude'] = lat
        merged_df.loc[rows_for_lake, 'longitude'] = lon

    return merged_df
def simulate_coordinates_for_missing(df):
    """
    Simulate coordinates for lakes without real coordinates.

    Every unique ``lake_name`` in ``df`` gets a latitude/longitude drawn from
    normal distributions centred on (-43.5, 171.75) with standard deviations
    0.5 (latitude) and 0.75 (longitude). The draws are then clamped to
    latitude [-44.5, -42.5] and longitude [170.0, 173.5].

    The draws come from a private generator seeded with 42, so the result is
    reproducible and the process-wide NumPy random state is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``lake_name`` column.

    Returns
    -------
    dict
        ``{lake_name: (latitude, longitude)}``, one entry per unique lake in
        order of first appearance. Empty when ``df`` has no rows.
    """
    # Get unique lakes missing coordinates
    missing_lakes = df['lake_name'].unique()

    # Canterbury region approximate center
    center_lat = -43.5
    center_lon = 171.75

    # A local RandomState(42) produces the same sequence as calling
    # np.random.seed(42) followed by np.random.normal, but does not reseed
    # the global generator that other cells may rely on.
    rng = np.random.RandomState(42)

    coordinates = {}
    for lake in missing_lakes:
        # Scatter lakes around the center with some clustering
        lat = center_lat + rng.normal(0, 0.5)
        lon = center_lon + rng.normal(0, 0.75)

        # Keep within the bounding box used for Canterbury
        lat = max(min(lat, -42.5), -44.5)
        lon = max(min(lon, 173.5), 170.0)

        coordinates[lake] = (lat, lon)

    return coordinates
# Function to prepare data for the bubble map
def prepare_map_data(df):
    """
    Build a one-row-per-lake table for the Canterbury bubble maps.

    Rows are restricted to region 'Canterbury' and to the indicators
    'Ammoniacal nitrogen' and 'E. coli'. Coordinates are attached via
    ``add_real_coordinates``, and the values are averaged per lake into one
    column per indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``region``, ``indicator_name``, ``lake_name`` and
        ``value`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``lake_name``, ``latitude``, ``longitude`` plus one column
        per indicator. An indicator with no data at all is still present,
        filled with NaN.
    """
    wanted_indicators = ['Ammoniacal nitrogen', 'E. coli']

    # Narrow down to the region first, then to the two contaminants
    in_canterbury = df[df['region'] == 'Canterbury']
    keep_rows = in_canterbury['indicator_name'].isin(wanted_indicators)
    filtered = in_canterbury[keep_rows]

    # Real coordinates where known, simulated ones otherwise
    located = add_real_coordinates(filtered)

    # Long -> wide: one column per indicator, mean value per lake
    wide = located.pivot_table(
        index=['lake_name', 'latitude', 'longitude'],
        columns='indicator_name',
        values='value',
        aggfunc='mean'
    )
    wide = wide.reset_index()

    # Drop the leftover 'indicator_name' label on the column axis
    wide.columns.name = None

    # Guarantee both indicator columns exist even if one had no rows
    for indicator in wanted_indicators:
        if indicator not in wide.columns:
            wide[indicator] = np.nan

    return wide
# Function to create an interactive bubble map
def create_bubble_map(data, indicator):
    """
    Build a Plotly bubble map of one indicator across lakes.

    Each lake is drawn at its latitude/longitude. Bubble size is the
    indicator value rescaled linearly to the range 10-50, and bubble colour
    follows the raw indicator value on the sequential Blues scale.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``lake_name``, ``latitude``, ``longitude`` and an
        ``indicator`` column. It is not modified.
    indicator : str
        Name of the column to visualise.

    Returns
    -------
    plotly figure or None
        None (after printing a message) when the indicator has no values or
        all of its values are equal, because no size scale can be derived.
    """
    # Lakes without a measurement for this indicator would end up with a NaN
    # marker size, which Plotly refuses; leave them off the map. Working on a
    # filtered copy also keeps the caller's frame free of helper columns.
    plot_data = data[data[indicator].notna()].copy()

    min_val = plot_data[indicator].min()
    max_val = plot_data[indicator].max()

    # Skip if no data, or if the range is degenerate (division by zero below)
    if pd.isna(min_val) or pd.isna(max_val) or min_val == max_val:
        print(f"No valid data range for {indicator}")
        return None

    # Normalised bubble size: smallest value -> 10, largest value -> 50
    plot_data['size'] = 10 + ((plot_data[indicator] - min_val) / (max_val - min_val)) * 40

    # Create the map
    fig = px.scatter_mapbox(
        plot_data,
        lat="latitude",
        lon="longitude",
        size="size",
        color=indicator,
        hover_name="lake_name",
        # Hide the helper/position columns in the tooltip; show the value
        hover_data={
            "size": False,
            "latitude": False,
            "longitude": False,
            indicator: ':.3f'
        },
        zoom=7,
        height=700,
        width=1000,
        color_continuous_scale=px.colors.sequential.Blues,
        title=f"Canterbury Lakes: {indicator} Contamination Levels",
        center={"lat": -43.8, "lon": 171.8},  # Adjusted center for real coordinates
        mapbox_style="open-street-map"
    )

    fig.update_layout(
        margin={"r": 0, "t": 50, "l": 0, "b": 0},
        coloraxis_colorbar=dict(
            title=indicator,
            thicknessmode="pixels", thickness=20,
            lenmode="pixels", len=300,
            yanchor="top", y=1,
            ticks="outside"
        )
    )

    return fig
# Function to create side-by-side maps of both contaminants
def create_combined_map(state_df):
    """
    Build one figure with two map panels: Ammoniacal nitrogen and E. coli.

    Each panel takes its traces from ``create_bubble_map``. The panels use
    separate colour axes, so each indicator gets its own colour range and
    colour bar instead of sharing one scale across different units.

    Parameters
    ----------
    state_df : pandas.DataFrame
        Raw dataset, passed on to ``prepare_map_data``.

    Returns
    -------
    plotly figure or None
        None (after printing a message) when there is nothing to map. A panel
        whose bubble map could not be built is left empty.
    """
    # Prepare data
    map_data = prepare_map_data(state_df)

    if map_data.empty:
        print("No data available for mapping")
        return None

    # One row, two mapbox panels
    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "mapbox"}, {"type": "mapbox"}]],
        subplot_titles=["Ammoniacal Nitrogen", "E. coli"]
    )

    # (indicator column, subplot column, colour axis id, colour bar x position)
    # The colour bars sit just to the right of their own panel.
    panels = [
        ("Ammoniacal nitrogen", 1, "coloraxis", 0.455),
        ("E. coli", 2, "coloraxis2", 1.005),
    ]

    coloraxes = {}
    for indicator, col, coloraxis_id, colorbar_x in panels:
        panel_fig = create_bubble_map(map_data.copy(), indicator)
        if panel_fig is None:
            continue

        for trace in panel_fig.data:
            # Traces from create_bubble_map all point at "coloraxis"; give
            # each panel its own axis so the two indicators are not coloured
            # on one common range.
            trace.marker.coloraxis = coloraxis_id
            fig.add_trace(trace, row=1, col=col)

        # Only the traces are copied over, so restate the colour styling here
        coloraxes[coloraxis_id] = dict(
            colorscale=px.colors.sequential.Blues,
            colorbar=dict(
                title=indicator,
                x=colorbar_x,
                xanchor="left",
                thickness=20,
                len=0.8,
                ticks="outside"
            )
        )

    # Update layout with mapbox settings
    fig.update_layout(
        height=700,
        width=1800,
        title_text="Canterbury Lakes Contamination Levels",
        title_x=0.5,
        mapbox=dict(
            style="open-street-map",
            zoom=7,
            center=dict(lat=-43.8, lon=171.8)  # Adjusted center for real coordinates
        ),
        mapbox2=dict(
            style="open-street-map",
            zoom=7,
            center=dict(lat=-43.8, lon=171.8)  # Adjusted center for real coordinates
        ),
        **coloraxes
    )

    return fig
# Function to create a single map with toggle for contaminants
def create_toggle_map(state_df):
    """
    Build a single map with buttons that switch between the two indicators.

    The Ammoniacal nitrogen traces are visible initially and the E. coli
    traces start hidden. Each button flips trace visibility and updates the
    figure title and the colour-bar title to match.

    Parameters
    ----------
    state_df : pandas.DataFrame
        Raw dataset, passed on to ``prepare_map_data``.

    Returns
    -------
    plotly figure or None
        None (after printing a message) when there is no data or when either
        of the two bubble maps could not be built.
    """
    # Prepare data
    map_data = prepare_map_data(state_df)

    if map_data.empty:
        print("No data available for mapping")
        return None

    # Per-indicator figures whose traces are combined below
    nitrogen_fig = create_bubble_map(map_data.copy(), "Ammoniacal nitrogen")
    ecoli_fig = create_bubble_map(map_data.copy(), "E. coli")

    if nitrogen_fig is None or ecoli_fig is None:
        print("Could not create one or both maps")
        return None

    fig = go.Figure()

    # Nitrogen traces first (visible), then E. coli traces (initially hidden)
    for trace in nitrogen_fig.data:
        fig.add_trace(trace)
    for trace in ecoli_fig.data:
        trace.visible = False
        fig.add_trace(trace)

    n_nitrogen = len(nitrogen_fig.data)
    n_ecoli = len(ecoli_fig.data)

    def _toggle_button(label, visible, title, colorbar_title):
        # One button: sets trace visibility plus the matching titles.
        # Titles are addressed as "<...>.text" rather than as a bare string
        # under "title".
        return dict(
            label=label,
            method="update",
            args=[
                {"visible": visible},
                {"title.text": title,
                 "coloraxis.colorbar.title.text": colorbar_title}
            ]
        )

    # Only traces were copied, so carry over the colour-axis styling (colour
    # scale and colour-bar settings) of the initially visible map.
    fig.update_layout(coloraxis=nitrogen_fig.layout.coloraxis.to_plotly_json())

    # Add buttons for toggling between indicators
    fig.update_layout(
        updatemenus=[
            dict(
                type="buttons",
                direction="right",
                active=0,
                x=0.5,
                y=1.15,
                xanchor="center",
                yanchor="top",
                buttons=[
                    _toggle_button(
                        "Ammoniacal Nitrogen",
                        [True] * n_nitrogen + [False] * n_ecoli,
                        "Canterbury Lakes: Ammoniacal Nitrogen Contamination Levels",
                        "Ammoniacal nitrogen"
                    ),
                    _toggle_button(
                        "E. coli",
                        [False] * n_nitrogen + [True] * n_ecoli,
                        "Canterbury Lakes: E. coli Contamination Levels",
                        "E. coli"
                    )
                ]
            )
        ]
    )

    # Update layout
    fig.update_layout(
        height=700,
        width=1000,
        title_text="Canterbury Lakes: Ammoniacal Nitrogen Contamination Levels",
        title_x=0.5,
        margin={"r": 0, "t": 100, "l": 0, "b": 0},
        mapbox=dict(
            style="open-street-map",
            zoom=7,
            center=dict(lat=-43.8, lon=171.8)  # Adjusted center for real coordinates
        )
    )

    return fig
def create_interactive_maps(state_df):
    """
    Build both map variants from the raw dataset.

    Returns a ``(combined_map, toggle_map)`` tuple: first the result of
    ``create_combined_map``, then the result of ``create_toggle_map``.
    Either element is whatever the respective builder returned.
    """
    # Builders run in this order: side-by-side figure, then toggle figure
    builders = (create_combined_map, create_toggle_map)
    combined_map, toggle_map = [build(state_df) for build in builders]
    return combined_map, toggle_map
In [19]:
# Generate both map versions
combined_map, toggle_map = create_interactive_maps(state_df)

# Either builder returns None (after printing the reason) when it has nothing
# to draw, so only display the figures that were actually created.

# Side-by-side maps showing both contaminants
if combined_map is not None:
    combined_map.show()

# Toggle map where you can switch between contaminants
if toggle_map is not None:
    toggle_map.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: